package com.gglu.www;

import com.gglu.www.model.webmagic.Article;
import com.gglu.www.service.webmagic.ArticleService;
import com.gglu.www.util.SpringContextUtil;
import com.google.common.base.Strings;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

import javax.annotation.Resource;
import java.util.ArrayList;
import java.util.List;

/**
 * Hangzhou Xinmiao Network Technology Co., Ltd.
 *
 * <p>Crawler-style test that walks the paginated listing of meihua.info with WebMagic,
 * extracts either a news article or a "work" entry from every page it visits, and
 * stores the results in batches through {@link ArticleService#insertBatch}.
 *
 * <p>The class is both the JUnit entry point ({@link #saveContentForMeiHuaWang()}) and the
 * WebMagic {@link PageProcessor} that is handed to each {@link Spider}.
 *
 * @author meihf
 * @create 2018/1/28
 * @description
 */
public class PageHtmlForMeiHuaWang extends BaseControllerTest implements PageProcessor {

    /** Site root. It ends with a slash, so anything appended must not start with one. */
    private static final String MEIHUA_WANG = "https://www.meihua.info/";

    /** Host name of the site, without scheme or path, as expected by {@link Site#setDomain}. */
    private static final String MEIHUA_DOMAIN = "www.meihua.info";

    /** Prefix of a listing-page URL; the page number is appended to it. */
    private static final String LIST_PAGE_URL_PREFIX = MEIHUA_WANG + "?p=";

    /** First listing page to crawl (inclusive). */
    private static final int FIRST_LIST_PAGE = 62;

    /** Last listing page to crawl (inclusive). */
    private static final int LAST_LIST_PAGE = 3279;

    /** The buffer is written to the database whenever the page number is a multiple of this. */
    private static final int FLUSH_INTERVAL = 5;

    private final Site site = Site.me().setDomain(MEIHUA_DOMAIN);

    @Resource
    ArticleService articleService;

    /**
     * Articles scraped since the last flush.
     *
     * <p>NOTE(review): this plain {@link ArrayList} is filled from {@link #process(Page)},
     * which the spider presumably invokes on its own worker thread, and drained from the
     * test thread after {@code Spider.run()} returns. Confirm this stays safe before
     * configuring more than one spider thread.
     */
    private static List<Article> articleList = new ArrayList<>();

    /**
     * Crawls listing pages {@value #FIRST_LIST_PAGE} to {@value #LAST_LIST_PAGE}, one
     * {@link Spider} run per page, and persists the scraped articles in batches.
     *
     * <p>The buffer is flushed before every page whose number is a multiple of
     * {@value #FLUSH_INTERVAL}, and once more after the last page so that the tail of the
     * crawl is not left unsaved.
     */
    @Test
    public void saveContentForMeiHuaWang() {
        // Resolve the service from the Spring context once, up front, so a missing bean
        // fails the test before any crawling is done.
        articleService = (ArticleService) SpringContextUtil.getBean("articleService");

        for (int pageNo = FIRST_LIST_PAGE; pageNo <= LAST_LIST_PAGE; pageNo++) {
            if (pageNo % FLUSH_INTERVAL == 0) {
                flushArticles();
            }
            // process(Page) only touches the static buffer, so this instance can serve as
            // the processor for every spider.
            Spider.create(this)
                    .addUrl(LIST_PAGE_URL_PREFIX + pageNo)
                    .addPipeline(new ConsolePipeline())
                    .run();
        }

        // Persist whatever was scraped after the last in-loop flush.
        flushArticles();
    }

    /**
     * Writes the buffered articles through {@link #articleService} and starts a new buffer.
     * Does nothing when the buffer is empty, so that {@code insertBatch} is never handed an
     * empty batch (how it treats empty input is not visible from this class).
     */
    private void flushArticles() {
        if (articleList.isEmpty()) {
            return;
        }
        articleService.insertBatch(articleList);
        articleList = new ArrayList<>();
    }

    /**
     * Extracts at most one {@link Article} from the page and queues the article links found
     * on it for crawling.
     *
     * <p>The page is first read as a news article (title and summary present). If that does
     * not match, it is read as a "work" page (title and at least one body paragraph present).
     * Pages matching neither layout, such as the listing pages themselves, contribute links
     * only.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        Article article = extractNewsArticle(page);
        if (article == null) {
            article = extractWork(page);
        }
        if (article != null) {
            articleList.add(article);
        }

        // Listing pages: follow the link behind every article thumbnail.
        List<String> articleLinks = page.getHtml().xpath("//div[@class='news-list-list']/ul[@class='list news-list']/" +
                "li[@class='item']/div[@class='article-img']/a/@href").all();
        page.addTargetRequests(articleLinks);
    }

    /**
     * Reads the page as a news article.
     *
     * @param page the downloaded page
     * @return the populated article, or {@code null} when the title or the summary is missing
     */
    private Article extractNewsArticle(Page page) {
        String title = page.getHtml()
                .xpath("//div[@id='content']/div[@class='art-detail']/div[@class='title']/text()").toString();
        String summary = page.getHtml()
                .xpath("//div[@id='content']/div[@class='art-detail']/div[@class='summary']/text()").toString();
        if (Strings.isNullOrEmpty(title) || Strings.isNullOrEmpty(summary)) {
            return null;
        }

        // The paragraph list is stored in its List.toString() form ("[p1, p2, ...]"); the
        // format is left untouched here so new rows look like previously stored ones.
        String content = page.getHtml().xpath("//div[@id='content']/div[@class='art-detail']/" +
                "div[@class='article']/div[@class='art-content']/p/tidyText()").all().toString();
        // First run of digits in the view-counter element.
        String viewCount = page.getHtml()
                .xpath("//span[@class='sub-view']/span[@class='subinfo_item']/text()").regex("\\d+").toString();

        Article article = new Article();
        article.setTitle(title);
        article.setContent(content);
        article.setSummary(summary);
        article.setUrl(page.getUrl().get());
        article.setScan(viewCount);
        return article;
    }

    /**
     * Reads the page as a "work" entry.
     *
     * @param page the downloaded page
     * @return the populated article, or {@code null} when the title is missing or the page
     *         has no body paragraph
     */
    private Article extractWork(Page page) {
        String title = page.getHtml()
                .xpath("//div[@class='wd_top_inner']/div[@class='wd_title']/text()").toString();
        List<String> paragraphs = page.getHtml()
                .xpath("//div[@class='main-left']/div[@class='wd_text']/p/tidyText()").all();
        // Test the list itself: its toString() is "[]" when empty, which a null-or-empty
        // string check would wrongly accept as content.
        if (Strings.isNullOrEmpty(title) || paragraphs == null || paragraphs.isEmpty()) {
            return null;
        }

        String collectCount = page.getHtml()
                .xpath("//span[@id='span_btn_works_collect']/span[@id='span_works_collectCount']/text()").toString();
        String praiseCount = page.getHtml()
                .xpath("//span[@id='span_btn_works_praise']/span[@id='span_works_praiseCount']/text()").toString();
        String viewCount = page.getHtml()
                .xpath("//div[@class='wd_top_i']/span[4]/label[2]/text()").toString();

        Article article = new Article();
        // Same List.toString() storage format as for news articles.
        article.setContent(paragraphs.toString());
        article.setTitle(title);
        article.setScan(viewCount);
        article.setStar(collectCount);
        article.setUrl(page.getUrl().get());
        article.setPraise(praiseCount);
        return article;
    }

    /**
     * @return the crawl configuration used by every spider created from this processor
     */
    @Override
    public Site getSite() {
        return site;
    }
}
